org 100h   ; assume ax=bx=0 cx=0xff

;UNPACK:   ; ax=0xf40f bx=0x153 cx=0 si=0x1d6 di=0x356

;Prepare a table of powers of 4 (for SSE)
;[0xfff0]=0x00000000 [0xffe0]=0x01000000 ... [0xf000]=0xff000000
;[0xeff0]=0x00000000 [0xefe0]=0x01000000 ... [0xe000]=0xff000000
;                                            ...
;                                            [0x8000]=0xff000000

  ;Init: set video mode 13h, program the palette and fill memory from 0xffff down to
  ;0x1010 with the float constant table, all in one loop (one int 0x10 per iteration).
  ;Assumes the usual DOS .COM entry state (a zero word on top of the stack, sp=0xfffe)
  ;and the register values noted on the 'org' line - TODO confirm for the target loader.
  pop di       ; sp=di=0
  mov ax,0x13
P int 0x10     ; 1st pass: ax=0x0013 (set mode 13h); later passes: ax=0x1010 (set one DAC color)

  ;Table entry: the pushed dword is ch<<24 (low word = di = 0, high word = cx with cl=0).
  ;ch = bits 8..15 of -16*sp, so it grows by 1 every 16 bytes and wraps every 0x1000 bytes.
  ;Read as a float, +1 in the top byte is +2 in the exponent, i.e. a factor of 4.
  imul cx,sp,-16 ; cx = -16*sp; ch = table value (original note said "ah = -value/4 (last 256: 63..0)")
  mov cl,0     ; store each value 4 times
  push cx
  push di      ; sp-=4

  dec bx       ; bl=I = 0..255, L=I&7: .....LLL
  imul ax,bx,8
  and al,0x3f
;  add al,7
  mov dh,al    ; dh=R = 8*L: 00LLL000
  mul bl       ; ax = R*I
  mov ch,ah    ; ch=G = (8*L*I)>>8, cl=B = 0

  mov ax,0x1010; set palette color: bl=index dh=R ch=G cl=B
  cmp sp,ax    ; stop once the table reaches down to 0x1010
  jne P      ; ax=sp=0x1010 bx=0xc557 cx=0x1300 dx=0x389d
  pusha      ; last pushed is di=0 (for 'ret') sp=0x1000

  ;FPU setup: the time value t stays on the x87 stack between frames
  fninit
  fldz             ;| t=0

;Centering segments for the 0xcccd trick: mov ax,0xcccd | mul pixel_address | add dx,segment
; segment=0x9f??: error in pixels = (segment*16-0xa0000 - ((x&0xff)-128)/256*320) % 320
; nice values:
;   0x9ff5 -2.25
; ->0x9fe0 +8       choose this one because it's divisible by 0x10
;   0x9fdf -6.75
;   0x9fca +3.5
;   0x9fb4 -1
; ----------------- 0x9fa0 is the lowest segment that can access the whole screen
;   0x9f9e -5.5
;   0x9f89 +4.75
;   0x9f73 +0.25
  mov si,0x9fe0    ; si is not written again in this file: it serves as the ES value,
                   ; as the centering bias added to dx, and as a 16-byte aligned offset
  mov es,si        ; destination segment for the stosb in the pixel loop

;Register values the K_* address macros rely on (as loaded elsewhere in this file):
;si=9fe0
;bx=480:  bx+si=a460
;bp=5320: bp+si=f300 (16-bit wrap). The constant table repeats every 0x1000 bytes,
;         so f300 holds the same value as a300. (Older notes used bp=300: bp+si=a2e0.)
;K(x) = offset, within the a000..aff0 page, of the 16-byte table slot whose dwords
;       have x>>8 as their top byte.
;Each K_* expands to an effective address of the form bx+si+disp8 or bp+si+disp8;
;the register terms cancel against the constants above, leaving only a small displacement.
;The first number in each comment below is K(x)-0xa000.
%define K(x) 0xa000 + (((~x)&0xff00) >> 4)

%define K_TIME_DELTA        si-0x9fe0+bx-0x480+K(0xbc00)  ;430 bx+si-30h; -1/128
;%define K_TIME_DELTA        si-0x9fe0+bx-0x480+K(0xbd00)  ;420 bx+si-40h; -1/32

%define K_NEG_2             si-0x9fe0+bx-0x480+K(0xc000)  ;3f0 bx+si-70h; -2
%define K_NEG_HALF_SCALE    si-0x9fe0+bx-0x480+K(0xbe00)  ;410 bx+si-50h; -1/8

%define K_NEG_Z0            si-0x9fe0+bp-0x320+K(0xcf00)  ;300 bp+si+00h; -2**31

%define K_NEG_EPS           si-0x9fe0+bp-0x320+K(0xcd00)  ;320 bp+si+20h; -1/16 * 2**31
%define K_BRIGHT_MAGIC      si-0x9fe0+bp-0x320+K(0xd700)  ;280 bp+si-80h; -2**47 = -2.0*2^(15+31); ulp=2**24 = EPS/8 = 1/128 * 2**31
;Commented-out alternatives: their bp+si offsets appear to be relative to the older bp=300 base
;%define K_NEG_EPS           si-0x9fe0+bp-0x320+K(0xcc00)  ;330 bp+si+50h; 1/64 * 2**31
;%define K_BRIGHT_MAGIC      si-0x9fe0+bp-0x320+K(0xd600)  ;290 bp+si-50h; ulp=EPS/8 = 1/512 * 2**31 => magic=2.0*2^(15+31)

;%define K_HUE_MAGIC         si-0x9fe0+bp-0x320+K(0xd800)  ;270 bp+si-70h; ulp=2/8 * 2**31 = 1/16 * 2**31  => magic=2.0*2^(19+31)
;NOTE(review): the active K_HUE_MAGIC used to carry the ulp/magic text of the 0xd800 variant;
;the table value selected here is 0xd4000000 = -2**41 (ulp 2**18 at that magnitude) - confirm intent.
%define K_HUE_MAGIC         si-0x9fe0+bp-0x320+K(0xd400)  ;2b0 bp+si-50h; -2**41

%define K_TRANSLATION       si-0x9fe0+bp-0x320+K(0xce00)  ;310 bp+si+10h; -1/4 * 2**31



;For 16:9 screens: pixel aspect ratio = 1.03
;%define K_X_SCALE           K(0x3020)  ; 2.5 * 2**-32: x -> ..1.25
;%define K_Y_SCALE           K(0x2fe0)  ; 1.75 * 2**-32: y -> ..0.6836

;For 4:3 screens: pixel aspect ratio = 0.96
;%define K_X_SCALE      K(0x3f80)  ; 1.0    ; 2.0 * 2**-32: x -> ..1.0
;%define K_Y_SCALE      K(0x3f80)  ; 1.0    ; 2.0 * 2**-32: y -> ..0.7813

;For each frame: prepare rotation constants
;Frame entry point. In: st0 = t. Builds cos/sin of three angles (t, log2(e)*t, log10(2)*t)
;and writes each one as four identical dwords so SSE code can load it as a packed vector.
;Out: C3 S3 C2 S2 C1 S1 at 0x420,0x430,...,0x470; bx=0x480; cx=0; st0 = t advanced by one step.
M fld st0
  fsincos          ;| C1 S1 t
  fldl2e
  fmul st3         ;| 1.44*t C1 S1 t
  fsincos          ;| C2 S2 C1 S1 t
  fldlg2
  fmul st5         ;| 0.30*t C2 S2 C1 S1 t
  fsincos          ;| C3 S3 C2 S2 C1 S1 t

;Store each constant four times
  mov bx,0x420     ; bh=4: dword stride for 'add bl,bh' (an older note said bh=54)
STORE:
  ;Only bl advances and (256*ch+4)*4 = 16 (mod 256), so a nonzero ch on the first pass
  ;just rewrites the 0x400..0x4ff page with st0 and still leaves bl 16 further on;
  ;'loop' then leaves cx=0 for the remaining passes.
  mov cl,4         ;<- ch doesn't have to be 0 on init because t=0
STORE4:
  fst dword[bx]    ;0x400 10 20 30 40 50 60 70 80
  add bl,bh        ;         C3 S3 C2 S2 C1 S1 XY, pixel data transfer
  loop STORE4      ; 'loop' and 'fstp' leave the flags from 'add bl,bh' intact
  fstp st0
  jns STORE        ; loop 6 times: bx=0x480 (bl=0x80 sets the sign flag)

  fsub dword[K_TIME_DELTA] ;| t+=dt  (subtracts the negative table constant)

;Base addresses used by the rotation loop in MAP: bx steps 0x420, 0x440, 0x460
%define COS bx
%define SIN bx+0x10

;For each 4-pixel batch:
;For each 4-pixel batch: write the 4 colors computed for the previous batch and
;store the 4 fixed-point coordinates of this batch in their place.
; In:  bx=0x480, si=0x9fe0 (also in es), di=pixel offset, bp bits 0..3 = hit mask of the
;      previous batch (from movmskps), [bx]=4 hue dwords, [bx+si]=4 brightness dwords
; Out: di+=3 (4 stosb, 1 dec), bx=0x480, cx=0, [INT_X]/[INT_Y] = 4 packed X / Y dwords
; Clobbers: ax, dx, bp (shifted right by 4), flags
;
;The counter is loaded as a full word: the preceding normal-estimation 'call MAP' is entered
;with cx=0 and sp=0x0ffe, so its 'add cx,sp' loop needs 17 adds to carry and returns
;cx=0x0fde. With 'mov cl,4' the 'loop B' below would then run 0x0f04 times instead of 4.
X mov cx,4         ; bx=0x480, cx=4 regardless of what MAP left in ch

;Combine brightness and hue from the last batch
B shr bp,1         ; background mask: next mask bit -> CF
  mov ax,[bx]      ; ah=hue = orbit trap: 8..31.99 (floor(x) ~ round(x*256)>>8)
  salc             ; al=0 (background, CF=0) or 0xff (fractal, CF=1)
  add al,[bx+si]   ; al=brightness = normal.Z: -1 + 0..8
  cmovnc ax,si     ; - if it was 0+x or -1+0, make it black
                   ;   (si=0x9fe0 -> 'aad 8' yields index 0xd8; its low 3 bits are 0,
                   ;    which the palette formula maps to R=G=B=0)
  aad 8            ; color = hue*8 + brightness  (al = ah*8+al, ah = 0)
;  aad -8           ; color = hue*-8 + brightness
;  sub al,32
  stosb            ; es:[di] = color, di++

;Store coordinates for this batch
  mov ax,0xcccd
  mul di           ; dx:ax = 0xcccd * pixel_address
  add dx,si      ; 0xcccd*pixel_address + 0x9fe00000: center X and Y

;  ; 10 bytes, free si
;  mov [bx],ax
;  mov [bx+2],dx
;  add bl,bh
;%define INT_X bx-1     ; x = 2^32 * (-0.5..0.5)
;%define INT_Y bx       ; y = 0xcccd * 320 * (-100..100) = 2^32 * (-0.3906..0.3906)

  ; 10 bytes, free bh
  ;The unaligned word stores below build both dwords in place. With bx=0x480 on the
  ;first pass they hit 0x481, 0xa462 and 0x483. Byte +0 of the X dword and bytes +0,+1
  ;of the Y dword keep whatever was there (low-order bits of the integer only).
  ;The 0x483 store also spills dh into byte +0 of the next dword, which 'salc' discards
  ;when that dword is read as hue on the next pass.
  inc bx
  mov [bx],ax     ; X = dl:ah:al:__
  inc bx          ;     +3 +2 +1 +0
  mov [bx+si],dx  ; Y = dh:dl:__:__
  inc bx
  mov [bx],dx
  inc bx
%define INT_X bx       ; x = 2^32 * (-0.5..0.5)
%define INT_Y bx+si    ; y = 0xcccd * 320 * (-100..100) = 2^32 * (-0.3906..0.3906)

  loop B         ; di+=4 bx=0x490
  dec di         ; undone by the 'inc di' that tests for the end of the frame
  mov bl,0x80    ; bx=0x480 again: INT_X=0x480, INT_Y=0xa460

;SSE register roles. Each register holds 4 lanes = the 4 pixels of the current batch.
%define x xmm0 ; XYZ coordinates in the fractal iteration
%define y xmm1
%define z xmm2
%define o xmm3 ; output: orbit trap
%define a xmm4 ; scratch, output: estimated distance
%define b xmm5 ; scratch
%define c xmm6 ; translation [-c,-c/4,0]
%define d xmm7 ; depth (camera Z)

;Trace steps along a ray
;  bp is reloaded for every batch because 'movmskps ebp,d' below overwrites it and the
;  pixel loop shifts it; the K_* macros based on bp expect 0x5320 here.
  mov bp,0xa2e0-0x9fe0+0x5000+0x20   ; 0x5320
  ;cx is the step counter: cl=0 here (left by 'loop B'), and each MAP call adds sp=0x0ffe
  ;to cx 16 times, i.e. -0x20 net, so cx runs 0x0300 -> 0 in 24 calls.
  mov ch,3      ; 24 steps: +0x0ffe * 16 * 24 = 0x17fd00
  movaps d,[K_NEG_Z0]; d=-1
T call MAP
  subps d,a     ; d -= -map(X,Y,d)
  jnz T         ; ZF is still the one from MAP's last 'add cx,sp' (subps/ret keep flags)

;Compute normal.Z (scaled by ambient occlusion)
  movaps [si],a    ; store last step (scratch at ds:0x9fe0)
  addps d,[K_NEG_EPS]
  addps d,a     ; undo last "subps d,a"
  ;NOTE(review): this call is entered with cx=0, so with sp=0x0ffe inside MAP it takes
  ;17 adds (one iteration more than the trace steps) to carry and returns cx=0x0fde;
  ;the earlier "cx=0x00e0" figure corresponds to sp=0x100e. ch is therefore nonzero
  ;when control reaches X or M again.
  call MAP      ; a = -map(X,Y,d+EPS)
  addps d,[K_NEG_Z0]; d-=1: d = -2..0

;Clip by the far plane, reject normals pointing away
  subps a,[si]     ; a = -(map(X,Y,d+EPS) - map(X,Y,d))
  andps d,a        ; the sign bit of d survives only where a is negative too

;Store brightness and hue
  addps a,[K_BRIGHT_MAGIC] ; shift the value into the lowest float byte
  addps o,[K_HUE_MAGIC]
  movaps [bx+si],a    ; 0xa460: brightness, read as byte +0 by the pixel loop
  movaps [bx],o ; 0x0480: hue, read as byte +1 by the pixel loop

  movmskps ebp,d ; a<0 and d>=0? ok : 0 (background or grazing hit)
                 ; bits 0..3 = sign bits of the 4 lanes, consumed by 'shr bp,1'

;Next pixel
  inc di  ; pairs with the 'dec di' after 'loop B'; ZF only when di wraps to 0
  jnz X   ; ax=di=0

;Esc test, next frame
  in al,0x60 ; keyboard data port; the Esc make code is 1
  dec ax  ; ah=0 from the last "mul di"
  jnz M   ; fallthrough
          ; On Esc, execution runs into MAP once more; its 'ret' pops the di=0 word that
          ; 'pusha' left on top of the stack and jumps to offset 0 (presumably the
          ; INT 20h at the start of the PSP - confirm for the target environment).

;Return the box distance to the KIFS fractal
;MAP - iterated fold/translate distance estimate for 4 pixels at once.
; In:  d = depth per lane; [INT_X],[INT_Y] = packed 32-bit integer coordinates;
;      cos/sin constants at 0x420..0x47f; bx=0x480 (only bl is changed, bh must be 4);
;      si=0x9fe0, bp=0x5320 for the K_* macros; cx = iteration accumulator
; Out: a = -(length - 2*c) per lane; o = orbit trap = min over all iterations of -length
;      and the initial c; bx=0x480; cx and flags as left by the last 'add cx,sp'
;      (CF=1 always, ZF=1 iff cx wrapped to exactly 0 - the caller's step loop tests ZF)
; Clobbers: x, y, z, b, c
;The iteration count is not fixed: the body repeats until 'add cx,sp' carries.
MAP:            ; bx=0x480 sp=0x0ffe
;  movups x,[INT_X]
;  cvtdq2ps x,x
;  cvtdq2ps y,[INT_Y]
  cvtdq2ps x,[INT_X]  ; both operands are 16-byte aligned (0x480 and 0xa460)
  cvtdq2ps y,[INT_Y]

  movaps c,[K_TRANSLATION] ; c=-1/4: translation=[-c,-c/4,0]
  movaps o,c    ; o=-1/4
  movaps z,d

;Rotate in the XZ, YX and ZY planes
;  Three passes with bl=0x20,0x40,0x60 select (C3,S3), (C2,S2), (C1,S1); the axes are
;  cycled after each pass, and 'add bl,0x20' reaching 0x80 sets SF and ends the loop.
L mov bl,0x20   ; restart the rotation table (an older note here said "ch=0 on init")
R movaps b,[COS]; b=C3 a=S3 | b=C2 a=S2 | b=C1 a=S1
  movaps a,[SIN]
  mulps b,z     ; b=Cz
  mulps z,a     ; z=Sz
  mulps a,x     ; a=Sx
  mulps x,[COS] ; x=Cx
  subps a,b     ; a=x'=Sx-Cz
  addps z,x     ; z=z'=Sz+Cx
  movaps x,y    ; cycle x,y,z <- y,z,a
  movaps y,z
  movaps z,a
  add bl,0x20   ; 0x20 | 0x40 | 0x60
  jns R         ; bx=0x480 a=z

;Reflect along X and Y
;  OR-ing with the bit pattern of -2.0 (0xc0000000) sets the sign bit and the top
;  exponent bit; for operands whose magnitude is already >= 2 only the sign changes.
;  NOTE(review): presumably all coordinates stay in that range at the 2**31 scale - confirm.
  movaps b,[K_NEG_2]
  orps x,b      ; x=-|x|
  orps y,b      ; y=-|y|

;Box-distance (L_inf) to the origin
  orps a,b      ; a=-|z|
  minps a,x
  minps a,y     ; a=-length = min(-|x|,-|y|,-|z|)

;Orbit trap
  minps o,a     ; orbit=min(orbit,-length)

;Translate by [-c,-c/4,0]
  mulps b,[K_NEG_HALF_SCALE]  ; b=0.25  -2 * -0.125 = 0.25
  mulps b,c     ; b=c/4
  subps x,c     ; x-=c
  subps y,b     ; y-=c/4

;Scale translation
  subps c,b     ; c-=c/4 (c*=3/4)

;Next iteration
;  sp doubles as the loop increment: 16 adds of 0x0ffe sum to 0xffe0 = -0x20 (mod 2**16)
  add cx,sp     ; +0x0ffe
  jnc L

  subps a,c
  subps a,c     ; a=-(length-2*c)
  ret           ; bx=0x480
